Import der Daten und Vorverarbeitung

In [1]:
# import relevant modules
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
import plotly.express as px
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm
import plotly.graph_objects as go
import glob
import sys
sys.path.append('../scripts/')
from analysis import get_correlation, peak_analysis, peak_ranges
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [2]:
# load only hashtags which are relevant
topics_df = pd.read_json('../../data/BTW17_Twitter/lda/hashtag_topics.json')
hashtags = topics_df['hashtag'].tolist()
In [3]:
# load hashtag timeseries
hashtag_df = pd.read_json('../../data/BTW17_Twitter/hashtags/hashtag_counts.json')
hashtag_df.head(3)
Out[3]:
date hashtag count
0 2017-05-29 150jahrekapital 1
1 2017-05-29 a19 1
2 2017-05-29 abschiebung 14
In [4]:
# load politicans metadata and keep only relevant data
persons_df = pd.read_csv('../../data/BTW17_Suggestions/btw_politicians_demographic.csv')
persons_df.drop(columns=['Unnamed: 0', 'Born', 'Bundesland', 'Age'], inplace=True)
persons_df['Name'] = persons_df['Name'].apply(lambda x: x.lower())
persons_df.rename(columns={'Name':'queryterm', 'Party':'party', 'Gender':'gender'}, inplace=True)
persons_df.head(3)
Out[4]:
queryterm party gender
0 wolfgang stefinger CSU male
1 kai whittaker CDU male
2 katrin albsteiger CSU female
In [5]:
cluster_cat = pd.read_csv('../../data/BTW17_Suggestions/suggestions/cluster_categories.csv', delimiter=',')
cluster_cat.drop(columns='Unnamed: 0', inplace=True)
cluster_cat['size'] = cluster_cat['sugg'].apply(lambda x: x.count(', ')+1)
cluster_cat.head(3)
Out[5]:
cluster category sugg size
0 -1 Rauschen büro lorenz caffier, peter uldall juhl, cloud ... 6217
1 0 Rauschen gebrochen, stadt land fluss, konzert für dich,... 346
2 1 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
In [6]:
# load suggestions timeseries
tmp = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
tmp['date'] = pd.to_datetime(tmp['date']).dt.date
suggestions_df = pd.DataFrame()
suggestions_df[['date', 'queryterm', 'suggestion', 'count']] = tmp.groupby(['date', 'queryterm', 'suggestion'], as_index=False).count()
suggestions_df = suggestions_df.merge(persons_df, how='left', on='queryterm')
In [7]:
# load vector similarites
similarity_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/vector_similarity.json')
similarity_df['hashtags'] = [hashtags for i in similarity_df.index]
similarity_df['suggestion'] = similarity_df['suggestion'].apply(lambda x: ' '.join(x))
In [8]:
# join suggestion cluster and  group again
suggestions_df = suggestions_df.merge(similarity_df, how='inner', on='suggestion')
suggestions_df = suggestions_df.groupby(['date', 'queryterm', 'party', 'gender', 'cluster'], as_index=False).sum('count')
suggestions_df.head(3)
Out[8]:
date queryterm party gender cluster count
0 2017-05-29 achim post SPD male 2 4
1 2017-05-29 achim post SPD male 5 12
2 2017-05-29 achim post SPD male 75 4
In [9]:
# remodel similarity cluster to hashtags
similarity_df = similarity_df.set_index(['suggestion', 'cluster']).apply(pd.Series.explode).reset_index()
similarity_df['similarity_scores'] = pd.to_numeric(similarity_df['similarity_scores']) 
similarity_df = similarity_df.groupby(['cluster', 'hashtags'], as_index=False).mean('similarity_scores')
similarity_df = similarity_df.merge(cluster_cat, how='left', on='cluster')

# filter out category rauschen
similarity_df = similarity_df[similarity_df['category']!='Rauschen'].reset_index(drop=True)
similarity_df.head(3)
Out[9]:
cluster hashtags similarity_scores category sugg size
0 1 afdwählen 0.008258 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
1 1 afghanistan -0.011473 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
2 1 altersarmut -0.008137 Personen sabine zeidler, birga köhler, rosemarie heinem... 256
In [10]:
# prepare data for tlcc

# filter everything with sim_score < 0.5
sim_df = similarity_df[similarity_df['similarity_scores']>=0.5].reset_index(drop=True)

# group suggestions to cluster
cluster_df = suggestions_df.groupby(['date', 'cluster'], as_index=False).sum('count')
cluster_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and party
cluster_party_df = suggestions_df.groupby(['date', 'party', 'cluster'], as_index=False).sum('count')
cluster_party_df.rename(columns={'count':'cluster_count'}, inplace=True)

# group suggestions per cluster and gender
cluster_gender_df = suggestions_df.groupby(['date', 'gender', 'cluster'], as_index=False).sum('count')
cluster_gender_df.rename(columns={'count':'cluster_count'}, inplace=True)

hashtag_df.rename(columns={'count':'hashtag_count'}, inplace=True)
In [11]:
colors = px.colors.qualitative.Antique
colors.extend(px.colors.qualitative.Antique)

Time Lagged Cross Correlation

In [12]:
delays = []
for i in range(0, 71, 7):
    delays.append(i)
In [13]:
#dfs = []
#for i in delays:
#    dfs.append(get_correlation(i, hashtag_df, cluster_df, cluster_gender_df, cluster_party_df, sim_df))
In [14]:
#for i in range(len(dfs)):
#    dfs[i].to_json(f'../../data/Analysis/df_{delays[i]}_delays.json')
In [15]:
# set to *.json to load all
input_loc = '../../data/Analysis/*delays.json'
input_files = glob.glob(input_loc)

dfs = []
for file in input_files:
    data = pd.read_json(file)
    data = data.merge(cluster_cat, how='left', on='cluster')
    #data = data[(data['pearsonr']>=0)&(data['p_value']<=0.05)&(data['gender']=='all')&(data['party']=='all')]
    data = data[(data['pearsonr']>=0)]
    dfs.append(data)

Deskriptives

In [16]:
print(f'Anzahl möglicher Kombinationen: {len(similarity_df[similarity_df["category"]!="Rauschen"])}')
print(f'Anzahl relevanter Kombinationen: {len(sim_df)}')
print(f'Anzahl Kombinationen pro Hashtag: {len(sim_df)/sim_df["hashtags"].nunique()}')
print(f'Anteil relevanter Kombinationen: {round(len(sim_df[sim_df["category"]!="Rauschen"])/len(similarity_df[similarity_df["category"]!="Rauschen"])*100,2)}%')
Anzahl möglicher Kombinationen: 114696
Anzahl relevanter Kombinationen: 1050
Anzahl Kombinationen pro Hashtag: 6.481481481481482
Anteil relevanter Kombinationen: 0.92%

Kategorien der Suchvorschlag Cluster

In [17]:
for category in sim_df['category'].unique():
    tmp = sim_df[sim_df['category']==category]
    print(f'Kategorie: {category}, Anzahl relevanter Kombinationen: {tmp.groupby(["cluster", "hashtags"], as_index=False).ngroups}')
Kategorie: Personen, Anzahl relevanter Kombinationen: 690
Kategorie: Orte, Anzahl relevanter Kombinationen: 41
Kategorie: Politik, Anzahl relevanter Kombinationen: 108
Kategorie: Medizin, Anzahl relevanter Kombinationen: 12
Kategorie: Organisationen, Anzahl relevanter Kombinationen: 18
Kategorie: Medien, Anzahl relevanter Kombinationen: 25
Kategorie: Wirtschaft, Anzahl relevanter Kombinationen: 52
Kategorie: Berufe, Anzahl relevanter Kombinationen: 104
In [18]:
sim_df.groupby('category', as_index=False)['similarity_scores'].mean()
Out[18]:
category similarity_scores
0 Berufe 0.720236
1 Medien 0.608040
2 Medizin 0.511738
3 Organisationen 0.572786
4 Orte 0.596317
5 Personen 0.588212
6 Politik 0.568747
7 Wirtschaft 0.578236
In [19]:
# load cluster_df and join categories
cluster_cat_df = pd.read_json('../../data/BTW17_Suggestions/suggestions/cluster.json')
cluster_cat_df = cluster_cat_df.merge(cluster_cat, how='left', on='cluster')

tmp = pd.DataFrame()
tmp['cluster'] = cluster_cat_df['cluster'].value_counts().index
tmp['Clustergröße'] = cluster_cat_df['cluster'].value_counts().values
tmp = tmp.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
tmp = tmp[tmp['category']!='Rauschen']
tmp2 = cluster_cat_df.groupby('category', as_index=False)['cluster'].nunique().sort_values(by='cluster', ascending=False)
tmp2.rename(columns={'cluster': 'n_cluster'}, inplace=True)
tmp = tmp.merge(tmp2, on='category')

tmp.rename(columns={'category':'Kategorie', 'cluster':'Cluster', 'n_cluster':'Anzahl Cluster'}, inplace=True)
In [20]:
tmp.groupby('Kategorie', as_index=False).mean()
Out[20]:
Kategorie Cluster Clustergröße Anzahl Cluster
0 Berufe 354.888889 19.555556 18.0
1 Justiz 279.000000 27.125000 8.0
2 Medien 359.818182 16.090909 22.0
3 Medizin 400.571429 16.142857 7.0
4 Organisationen 409.904762 18.666667 21.0
5 Orte 336.730769 31.410256 78.0
6 Personen 388.914081 22.069212 419.0
7 Politik 368.113636 23.784091 88.0
8 Privatleben 456.833333 23.916667 12.0
9 Wirtschaft 355.428571 23.142857 35.0
In [21]:
fig = px.scatter(cluster_cat_df, x='t-SNE(x)', y='t-SNE(y)', color='category', hover_name='suggestion',
                 template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Regressions-Analysen unterschiedlicher Dimensionen (Einflüsse auf die mittleren Similarity Scores)

In [22]:
# regressionsanalyse
reg_df = suggestions_df.groupby(['party', 'gender', 'cluster'], as_index=False).sum()
tmp = similarity_df.groupby(['cluster', 'category'], as_index=False).mean()
reg_df = reg_df.merge(tmp, how='left', on='cluster')
reg_df.dropna(inplace=True)
reg_df = reg_df.reset_index(drop=True)
reg_df.head(3)
Out[22]:
party gender cluster count category similarity_scores size
0 AFD female 2 534 Wirtschaft 0.038410 20.0
1 AFD female 9 428 Orte 0.013585 225.0
2 AFD female 13 25 Wirtschaft 0.015126 306.0
In [23]:
reg = smf.ols('similarity_scores ~ size + C(party) + C(gender) + C(category)', data=reg_df).fit()
reg.summary()
Out[23]:
OLS Regression Results
Dep. Variable: similarity_scores R-squared: 0.052
Model: OLS Adj. R-squared: 0.045
Method: Least Squares F-statistic: 7.234
Date: Sun, 23 Jan 2022 Prob (F-statistic): 2.83e-20
Time: 16:58:30 Log-Likelihood: 5575.9
No. Observations: 2656 AIC: -1.111e+04
Df Residuals: 2635 BIC: -1.099e+04
Df Model: 20
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.0116 0.005 2.550 0.011 0.003 0.021
C(party)[T.CDU] -0.0038 0.003 -1.226 0.220 -0.010 0.002
C(party)[T.CSU] -0.0033 0.003 -0.991 0.322 -0.010 0.003
C(party)[T.DIE LINKE] -0.0039 0.003 -1.175 0.240 -0.010 0.003
C(party)[T.FDP] -5.014e-06 0.004 -0.001 0.999 -0.008 0.008
C(party)[T.GRÜNE] -0.0023 0.003 -0.719 0.472 -0.009 0.004
C(party)[T.Parteilos] -0.0012 0.005 -0.249 0.803 -0.011 0.008
C(party)[T.SPD] -0.0029 0.003 -0.950 0.342 -0.009 0.003
C(party)[T.SSW] 0.0017 0.012 0.143 0.886 -0.021 0.025
C(party)[T.fraktionslos] -0.0030 0.007 -0.442 0.658 -0.017 0.010
C(gender)[T.male] 0.0004 0.001 0.316 0.752 -0.002 0.003
C(category)[T.Justiz] -0.0073 0.006 -1.284 0.199 -0.018 0.004
C(category)[T.Medien] -0.0310 0.004 -7.146 0.000 -0.039 -0.022
C(category)[T.Medizin] -0.0205 0.006 -3.550 0.000 -0.032 -0.009
C(category)[T.Organisationen] -0.0047 0.004 -1.090 0.276 -0.013 0.004
C(category)[T.Orte] -0.0101 0.004 -2.805 0.005 -0.017 -0.003
C(category)[T.Personen] -0.0042 0.003 -1.197 0.231 -0.011 0.003
C(category)[T.Politik] -0.0035 0.004 -0.946 0.344 -0.011 0.004
C(category)[T.Privatleben] 0.0051 0.005 1.131 0.258 -0.004 0.014
C(category)[T.Wirtschaft] 0.0031 0.004 0.748 0.454 -0.005 0.011
size -6.719e-06 1.25e-05 -0.539 0.590 -3.12e-05 1.77e-05
Omnibus: 97.716 Durbin-Watson: 2.018
Prob(Omnibus): 0.000 Jarque-Bera (JB): 265.080
Skew: -0.109 Prob(JB): 2.75e-58
Kurtosis: 4.532 Cond. No. 1.25e+03


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.25e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

Ergebnisse der TLCC

Fragestellung: Wie lange dauert die Durchdringung im Durchschnitt und nach den jeweiligen Dimensionen? Messung: TLCC mit Pearson R und p-Wert

Betrachtung über alle Kombinationen

In [24]:
# übersicht der korrelationen und deren p-werte pro time lag
delay_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    delay_list.append(int(delays[i]/7))
    df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
    r_list.append(round(df['pearsonr'].mean(),3))
    p_values = df['p_value'].to_numpy()
    p_list.append(round(stats.combine_pvalues(p_values)[1],3))
    
tmp = pd.DataFrame(data={'Time Lag (in Wochen)': delay_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp
Out[24]:
Time Lag (in Wochen) Pearson R P-Wert
0 0 0.120 0.0
1 1 0.122 0.0
2 2 0.119 0.0
3 3 0.131 0.0
4 4 0.141 0.0
5 5 0.131 0.0
6 6 0.161 0.0
7 7 0.170 0.0
8 8 0.163 0.0
9 9 0.171 0.0
10 10 0.116 0.0

Sämtliche Korrelationen sind signifikant (p<0.05), deshalb Betrachtung im Plot.

In [25]:
fig = px.line(tmp, x='Time Lag (in Wochen)', y='Pearson R',
              template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Plateau zwischen 6-9 Wochen, allerdings sehr geringe Korrelation. Ausnahmen bei 9 Wochen sind nur wenige:

In [26]:
tmp = dfs[9][(dfs[9]['gender']=='all')&(dfs[9]['party']=='all')]
tmp = tmp[tmp['pearsonr']>=0.5]
tmp.sort_values(by='pearsonr', ascending=False)[['cluster', 'hashtags', 'category_x', 'pearsonr', 'similarity_scores']]
Out[26]:
cluster hashtags category_x pearsonr similarity_scores
9814 711 bureg Personen 0.735616 0.675571
2482 387 btw2017 Personen 0.666920 0.660750
4679 505 btw2017 Personen 0.651856 0.512437
2508 387 bundestagswahl Personen 0.627181 0.692417
4705 505 bundestagswahl Personen 0.622913 0.554438
7513 620 btw Personen 0.589454 0.544000
7656 620 linke Personen 0.542172 0.563857
2950 387 traudichdeutschland Personen 0.541448 0.603667
2534 387 darumgrün Personen 0.539766 0.552000
4380 490 populismus Organisationen 0.533656 0.564286
7422 594 steineke Personen 0.513590 0.539500
2469 387 btw17 Personen 0.505943 0.603667
8397 671 islamisierung Orte 0.503226 0.658875

Betrachtung nach Kategorie der Cluster

In [27]:
delay_list = []
categories = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for category in set(similarity_df['category']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']=='all')]
        categories.append(category)
        r_list.append(df[df['category_x']==category]['pearsonr'].mean())
        p_values = df[df['category_x']==category]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Kategorie': categories, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()
tmp = tmp[tmp['Kategorie']!='Rauschen']

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Kategorie'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Kategorie'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Höchste signifikante Korrelation im Bereich zwischen 6-9 Wochen. Insbesondere Personen und Wirtschaft. Bestätigung der Ergebnisse der Gesamtbetrachtung.

Betrachtung nach Geschlecht der Personen im Suchterm

In [28]:
delay_list = []
gender_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for gender in set(suggestions_df['gender']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']!='all')&(dfs[i]['party']=='all')]
        gender_list.append(gender)
        df = df[df['category_x']!='Rauschen']
        r_list.append(df[df['gender']==gender]['pearsonr'].mean())
        p_values = df[df['gender']==gender]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Gender': gender_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Gender'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Gender'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Bestätigung der Ergebnisse aus der Gesamtbetrachtung.

Betrachtung nach Partei der Person im Suchterm

In [29]:
delay_list = []
party_list = []
r_list = []
p_list = []

for i in range(len(dfs)):
    for party in set(suggestions_df['party']):
        delay_list.append(delays[i])
        df = dfs[i][(dfs[i]['gender']=='all')&(dfs[i]['party']!='all')]
        party_list.append(party)
        df = df[df['category_x']!='Rauschen']
        r_list.append(df[df['party']==party]['pearsonr'].mean())
        p_values = df[df['party']==party]['p_value'].to_numpy()
        p_list.append(stats.combine_pvalues(p_values)[1])

tmp = pd.DataFrame(data={'Delay': delay_list, 'Parteien': party_list, 'Pearson R': r_list, 'P-Wert': p_list})
tmp = tmp.dropna()

fig = make_subplots(rows=1, cols=2, subplot_titles=('Pearson R', 'P-Werte'),
                    shared_yaxes=True, horizontal_spacing=0.15)

fig.add_trace(go.Heatmap(z=tmp['Pearson R'], x=tmp['Parteien'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu, colorbar_x=0.45), row=1, col=1)

fig.add_trace(go.Heatmap(z=tmp['P-Wert'], x=tmp['Parteien'],
                         y=tmp['Delay'],
                         colorscale=px.colors.sequential.RdBu_r), row=1, col=2)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

Grobe Bestätigung der Ergebnisse der Gesamtbetrachtung. Ausnahme AfD: Immer stark korreliert, ggf Partei mit starker Abhängigkeit von Twitter?

t-Tests der Mittelwerte der Tage um die Peaks

In [30]:
peaks_df = pd.read_json('../../data/BTW17_Twitter/peaks/peak_dates.json')
peaks_df['num_peaks'] = peaks_df.apply(lambda x: len(x['lda_dates']) / 7, axis=1)
peaks_df[['peak_start', 'peak_end']] = peaks_df.apply(peak_ranges, axis=1)
peaks_df.drop(columns=['index', 'num_peaks', 'lda_dates'], inplace=True)
peaks_df = peaks_df.set_index(['hashtag']).apply(pd.Series.explode).reset_index()
peaks_df.head(3)
Out[30]:
hashtag peak_start peak_end
0 afghanistan 2017-05-29 2017-06-04
1 afghanistan 2017-08-22 2017-08-28
2 armut 2017-07-03 2017-07-09
In [31]:
cluster_ts_df = suggestions_df.groupby(['date', 'cluster', 'party', 'gender'], as_index=False).sum('count')
cluster_ts_df = cluster_ts_df.merge(cluster_cat[['cluster', 'category']], how='left', on='cluster')
cluster_ts_df.head(3)
Out[31]:
date cluster party gender count category
0 2017-05-29 0 AFD female 16 Rauschen
1 2017-05-29 0 AFD male 97 Rauschen
2 2017-05-29 0 CDU female 532 Rauschen
In [32]:
analysis_dfs = []

for i in tqdm(range(len(delays[1:]))):
    test_range = delays[i+1]
    tmp = peak_analysis(test_range, sim_df, peaks_df, cluster_ts_df)
    tmp = tmp.reset_index(drop=True)
    analysis_dfs.append(tmp)
In [33]:
# save files
for i in range(len(analysis_dfs)):
    analysis_dfs[i].to_json(f'../../data/Analysis/peak_analysis_detail_range_{delays[i]}.json')
In [34]:
# set to *.json to load all
input_loc = '../../data/Analysis/peak_analysis*.json'
input_files = glob.glob(input_loc)

analysis_dfs = []
for file in input_files:
    data = pd.read_json(file)
    analysis_dfs.append(data)
In [42]:
for i in range(len(analysis_dfs)):
    tmp = analysis_dfs[i]
    test_range = tmp['test_range'].unique()
    a = tmp[tmp['time']=='after']['count']
    b = tmp[tmp['time']=='before']['count']
    results = stats.ttest_ind(a,b, equal_var=False)
    print(tmp.groupby('time', as_index=False).mean()[['time', 'count']])
    print(f'Test Range: {test_range}, t: {results[0]}, p: {results[1]}\n')
     time      count
0   after  15.483540
1  before  15.471336
Test Range: [7], t: 0.06244940490777691, p: 0.9502054819630756

     time      count
0   after  15.137195
1  before  15.187345
Test Range: [21], t: -0.26428452589512047, p: 0.7915629114571427

     time      count
0   after  15.060328
1  before  15.070531
Test Range: [28], t: -0.054207974689472, p: 0.9567699039965558

     time      count
0   after  14.986474
1  before  15.018879
Test Range: [35], t: -0.1730405467201768, p: 0.8626209569115229

     time      count
0   after  14.929444
1  before  14.970720
Test Range: [42], t: -0.22153269373478351, p: 0.8246794771290584

     time      count
0   after  14.852004
1  before  14.904595
Test Range: [49], t: -0.284077678467178, p: 0.7763531649380988

     time      count
0   after  14.766858
1  before  14.820700
Test Range: [56], t: -0.2929710015660142, p: 0.7695466861409888

     time      count
0   after  14.694571
1  before  14.738089
Test Range: [63], t: -0.23856538878544817, p: 0.8114444970951588

     time      count
0   after  14.622837
1  before  14.661717
Test Range: [70], t: -0.21468437521921363, p: 0.8300150887196274

     time      count
0   after  15.273014
1  before  15.354938
Test Range: [14], t: -0.42611708259286196, p: 0.6700263479896748

In [36]:
reg = smf.ols('count ~ C(party)*C(time) + C(gender)*C(time) + C(category)*C(time) + C(time)', data=tmp).fit()
reg.summary()
Out[36]:
OLS Regression Results
Dep. Variable: count R-squared: 0.317
Model: OLS Adj. R-squared: 0.316
Method: Least Squares F-statistic: 360.2
Date: Sun, 23 Jan 2022 Prob (F-statistic): 0.00
Time: 17:16:53 Log-Likelihood: -94531.
No. Observations: 24052 AIC: 1.891e+05
Df Residuals: 24020 BIC: 1.894e+05
Df Model: 31
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.7765 0.815 0.953 0.341 -0.821 2.374
C(party)[T.CDU] 6.2854 0.641 9.800 0.000 5.028 7.542
C(party)[T.CSU] -5.4799 0.693 -7.905 0.000 -6.839 -4.121
C(party)[T.DIE LINKE] -0.8121 0.691 -1.176 0.240 -2.166 0.542
C(party)[T.FDP] -3.6010 0.898 -4.010 0.000 -5.361 -1.841
C(party)[T.GRÜNE] -1.3362 0.681 -1.961 0.050 -2.672 -0.001
C(party)[T.Parteilos] 1.9386 0.874 2.218 0.027 0.225 3.652
C(party)[T.SPD] 9.5385 0.633 15.077 0.000 8.298 10.779
C(party)[T.fraktionslos] 7.1810 1.474 4.872 0.000 4.292 10.070
C(time)[T.before] 1.4763 1.169 1.263 0.207 -0.815 3.768
C(gender)[T.male] 0.2809 0.275 1.023 0.306 -0.257 0.819
C(category)[T.Medizin] 5.4922 1.459 3.765 0.000 2.633 8.352
C(category)[T.Organisationen] 4.0916 0.779 5.250 0.000 2.564 5.619
C(category)[T.Orte] 7.2513 0.587 12.355 0.000 6.101 8.402
C(category)[T.Personen] 8.6125 0.512 16.837 0.000 7.610 9.615
C(category)[T.Politik] 10.7874 0.520 20.731 0.000 9.767 11.807
C(category)[T.Wirtschaft] 31.1427 0.576 54.086 0.000 30.014 32.271
C(party)[T.CDU]:C(time)[T.before] -1.3854 0.930 -1.490 0.136 -3.207 0.437
C(party)[T.CSU]:C(time)[T.before] -0.0813 1.000 -0.081 0.935 -2.042 1.879
C(party)[T.DIE LINKE]:C(time)[T.before] -1.6677 0.998 -1.671 0.095 -3.624 0.289
C(party)[T.FDP]:C(time)[T.before] -0.0188 1.301 -0.014 0.988 -2.569 2.532
C(party)[T.GRÜNE]:C(time)[T.before] -0.1163 0.985 -0.118 0.906 -2.047 1.814
C(party)[T.Parteilos]:C(time)[T.before] -1.2330 1.264 -0.976 0.329 -3.710 1.244
C(party)[T.SPD]:C(time)[T.before] -1.5224 0.918 -1.658 0.097 -3.322 0.277
C(party)[T.fraktionslos]:C(time)[T.before] -2.2656 2.094 -1.082 0.279 -6.370 1.838
C(gender)[T.male]:C(time)[T.before] -0.1918 0.387 -0.496 0.620 -0.950 0.567
C(category)[T.Medizin]:C(time)[T.before] 0.6407 2.005 0.320 0.749 -3.290 4.571
C(category)[T.Organisationen]:C(time)[T.before] 0.3434 1.102 0.312 0.755 -1.817 2.504
C(category)[T.Orte]:C(time)[T.before] -0.6830 0.829 -0.824 0.410 -2.307 0.941
C(category)[T.Personen]:C(time)[T.before] 0.1700 0.725 0.235 0.815 -1.251 1.590
C(category)[T.Politik]:C(time)[T.before] -0.0222 0.736 -0.030 0.976 -1.466 1.421
C(category)[T.Wirtschaft]:C(time)[T.before] -2.0228 0.813 -2.487 0.013 -3.617 -0.428
Omnibus: 5152.635 Durbin-Watson: 2.098
Prob(Omnibus): 0.000 Jarque-Bera (JB): 14409.100
Skew: 1.137 Prob(JB): 0.00
Kurtosis: 6.035 Cond. No. 62.0


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.